In [69]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pandas.tools.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
In [2]:
data = np.loadtxt('pca-data-2d.dat', delimiter=' ', skiprows=0, usecols=range(0, 2))
data.shape
Out[2]:
In [3]:
data
Out[3]:
In [4]:
m = np.mean(data, 0)
m
Out[4]:
In [5]:
data_centered = np.subtract(data, m)
data_centered
Out[5]:
In [6]:
plt.figure()
plt.scatter(data_centered.T[0], data_centered.T[1])
plt.xlabel("col1")
plt.ylabel("col2")
plt.grid()
plt.show()
In [7]:
covariance = np.cov(data_centered.T)
covariance
Out[7]:
In [8]:
evals, evecs = np.linalg.eig(covariance)
transmat = evecs.T
transmat
Out[8]:
In [9]:
evec1 = transmat[0] #evecs[:, 0]
evec2 = transmat[1] #evecs[:, 1]
evec1
Out[9]:
In [10]:
data_trans = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data_centered)):
data_trans[i] = np.dot(transmat, data_centered[i])
data_trans
Out[10]:
In [11]:
plt.figure(figsize=(10, 10))
plt.scatter(data_centered.T[0], data_centered.T[1])
plt.plot([0, evec1[0]], [0, evec1[1]])
plt.plot([0, evec2[0]], [0, evec2[1]])
plt.scatter(data_trans.T[0], data_trans.T[1])
plt.grid()
plt.show()
In [12]:
transmat_inv = np.linalg.inv(transmat)
data_trans_inv = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data)):
data_trans_inv[i] = np.dot(transmat_inv, data_trans[i])
data_trans_inv
data_trans_PC1 = np.copy(data_trans)
data_trans_PC1[:, 1] = 0
data_trans_inv_PC1 = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data)):
data_trans_inv_PC1[i] = np.dot(transmat_inv, data_trans_PC1[i])
data_trans_PC2 = np.copy(data_trans)
data_trans_PC2[:, 0] = 0
data_trans_inv_PC2 = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data)):
data_trans_inv_PC2[i] = np.dot(transmat_inv, data_trans_PC2[i])
data_trans_PC2
Out[12]:
In [13]:
plt.figure(figsize=(10, 10))
plt.scatter(data_centered.T[0], data_centered.T[1])
plt.scatter(data_trans_inv_PC1.T[0], data_trans_inv_PC1.T[1])
plt.scatter(data_trans_inv_PC2.T[0], data_trans_inv_PC2.T[1])
red_patch = mpatches.Patch(color='red', label='full data')
blue_patch = mpatches.Patch(color='blue', label='only PC1')
green_patch = mpatches.Patch(color='green', label='only PC2')
plt.legend(handles=[red_patch, blue_patch, green_patch])
plt.grid()
plt.show()
In [14]:
data3d = np.loadtxt('pca-data-3d.txt', delimiter=',', skiprows=1)
data3d.shape # 3 axis 500 points
Out[14]:
In [15]:
mean3d = np.mean(data3d, 0)
data3d_centered = np.subtract(data3d, mean3d)
mean3d
Out[15]:
In [18]:
fig, axs = plt.subplots(3, 3, figsize=(9, 9))
for i in range(3):
for j in range(3):
axs[i][j].scatter(data3d_centered[:, i], data3d_centered[:, j])
plt.tight_layout(1)
axs[i][j].set_xlabel('col {}'.format(i+1))
axs[i][j].set_ylabel('col {}'.format(j+1))
fig.suptitle('Pairwise scatter plots of columns (x, y, z)', y=1.05, fontsize=16)
plt.show()
In [19]:
covariance3d = np.cov(data3d_centered.T)
evals3d, evecs3d = np.linalg.eig(covariance3d)
transmat3d = evecs3d.T
covariance3d
transmat3d
evals3d
# => Z is PC1 // Y is PC2 // X is PC3
Out[19]:
In [22]:
data3d_trans = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d_centered)):
data3d_trans[i] = np.dot(transmat3d, data3d_centered[i])
fig, axs = plt.subplots(3, 3, figsize=(9, 9))
for i in range(3):
for j in range(3):
axs[i][j].scatter(data3d_trans[:, i], data3d_trans[:, j])
plt.tight_layout(1)
axs[i][j].set_xlabel('col {}'.format(i+1))
axs[i][j].set_ylabel('col {}'.format(j+1))
fig.suptitle('Pairwise scatter plots of columns (PC1, PC2, PC3)', y=1.05, fontsize=16)
plt.show()
In [23]:
transmat3d_inv = np.linalg.inv(transmat3d)
data3d_trans_PC1 = np.copy(data3d_trans)
data3d_trans_PC1[:, 0] = 0
data3d_trans_PC1[:, 1] = 0
data3d_trans_PC1_recov = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d)):
data3d_trans_PC1_recov[i] = np.dot(transmat3d_inv, data3d_trans_PC1[i])
data3d_trans_PC12 = np.copy(data3d_trans)
data3d_trans_PC12[:, 0] = 0
data3d_trans_PC12_recov = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d)):
data3d_trans_PC12_recov[i] = np.dot(transmat3d_inv, data3d_trans_PC12[i])
data3d_trans_PC123_recov = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d)):
data3d_trans_PC123_recov[i] = np.dot(transmat3d_inv, data3d_trans[i])
data3d_trans_PC12[0, :]
Out[23]:
In [24]:
plt.figure(figsize=(10, 10))
plt.scatter(data3d_trans_PC123_recov.T[0], data3d_trans_PC123_recov.T[1])
plt.scatter(data3d_trans_PC12_recov.T[0], data3d_trans_PC12_recov.T[1])
plt.scatter(data3d_trans_PC1_recov.T[0], data3d_trans_PC1_recov.T[1])
blue_patch = mpatches.Patch(color='blue', label='PC123')
red_patch = mpatches.Patch(color='red', label='PC12')
green_patch = mpatches.Patch(color='green', label='PC1')
plt.legend(handles=[blue_patch, red_patch, green_patch])
plt.title('Scatter plot from x-y-layer')
plt.grid()
plt.show()
In [25]:
fig = plt.figure(figsize=(11, 11))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data3d_trans_PC123_recov[:, 0], data3d_trans_PC123_recov[:, 1], data3d_trans_PC123_recov[:, 2])
ax.scatter(data3d_trans_PC12_recov[:, 0], data3d_trans_PC12_recov[:, 1], data3d_trans_PC12_recov[:, 2])
ax.scatter(data3d_trans_PC1_recov[:, 0], data3d_trans_PC1_recov[:, 1], data3d_trans_PC1_recov[:, 2])
blue_patch = mpatches.Patch(color='blue', label='PC123')
red_patch = mpatches.Patch(color='red', label='PC12')
green_patch = mpatches.Patch(color='green', label='PC1')
plt.legend(handles=[blue_patch, red_patch, green_patch])
plt.title('Recovered data points')
plt.show()
Using only the first PC is too little information to near or compress the data. Using the first two PCs similars the original data quite well.
In [26]:
data = np.loadtxt('expDat.txt', delimiter=',', skiprows=1, usecols=range(1, 21))
data.shape, data
Out[26]:
In [28]:
data_centered = data - data.mean(axis=0)
data_centered
Out[28]:
In [30]:
covariance = np.cov(data_centered.T)
covariance.shape
Out[30]:
In [41]:
evals, evecs = np.linalg.eig(covariance)
evecs.T
Out[41]:
In [ ]:
In [92]:
from scipy.ndimage import imread
import os
In [95]:
n_patches = []
b_patches = []
for img_name in os.listdir('imgpca'):
img = imread(os.path.join('imgpca', img_name))
for i in range(500):
x = np.random.randint(img.shape[0] - 16)
y = np.random.randint(img.shape[1] - 16)
patch = img[x:x+16, y:y+16].flatten()
if img_name.startswith('n'):
n_patches.append(patch)
elif img_name.startswith('b'):
b_patches.append(patch)
n_patches = np.array(n_patches)
b_patches = np.array(b_patches)
n_patches.shape, b_patches.shape
Out[95]:
In [96]:
# Show some nature patches.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.flatten():
plt.sca(ax)
plt.imshow(n_patches[np.random.randint(len(n_patches))].reshape(16, 16), cmap='Greys', interpolation=None)
plt.axis('off')
In [97]:
# Show some building patches.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.flatten():
plt.sca(ax)
plt.imshow(b_patches[np.random.randint(len(b_patches))].reshape(16, 16), cmap='Greys', interpolation=None)
plt.axis('off')
In [99]:
n_patches_centered = n_patches - n_patches.mean(axis=0)
b_patches_centered = b_patches - b_patches.mean(axis=0)
In [100]:
n_covariance = np.cov(n_patches_centered.T)
b_covariance = np.cov(b_patches_centered.T)
n_covariance.shape, b_covariance.shape
Out[100]:
In [101]:
n_evals, n_evecs = np.linalg.eig(n_covariance)
b_evals, b_evecs = np.linalg.eig(b_covariance)
n_evecs.T.shape, b_evecs.T.shape
Out[101]:
In [103]:
# Nature PCAs.
fig, axes = plt.subplots(3, 4, figsize=(15, 10))
for i, ax in enumerate(axes.flatten()):
plt.sca(ax)
plt.imshow(n_evecs.T[i].reshape(16, 16), cmap='Greys', interpolation=None)
plt.axis('off')
In [104]:
# Building PCAs.
fig, axes = plt.subplots(3, 4, figsize=(15, 10))
for i, ax in enumerate(axes.flatten()):
plt.sca(ax)
plt.imshow(b_evecs.T[i].reshape(16, 16), cmap='Greys', interpolation=None)
plt.axis('off')
The first few PCAs from building and nature images are similar, they represent basic shades and edges (first rows in the plots above). However, the PCAs from the second and third rows above seem different - for buildings, the lines are edgy and straight, while for nature images, they seem to have more natural shapes.
In [135]:
plt.plot(n_evals[:100], '.', label='nature')
plt.plot(b_evals[:100], '.', label='buildings')
plt.ylabel('Eigenvalue')
plt.xlabel('PC')
plt.legend()
plt.ylim(0, 80000)
Out[135]:
For simplicity, only the first 100 PCs are plotted and the first PC is not shown due to its high eigenvalue. For both image categories, one should keep around 20 PCs according to the Scree test. This represents a compression of 1 - (20/256) = 92 %.
In [ ]: